This document was compiled on the 2024-11-14 14:38:34.792318 by carol.

1 Introduction

The data (Database_Hendrickx_2019_Dentes Poyos.xlsx) were provided by Elisabete Malafaia (EM) to Carolina Marques (CM) on 31/07/2024, via an external storage device.

The data contain several parameters obtained from measurements of theropod teeth; most of them are explained in the following schemes:

Fig 1: First scheme with the variables obtained from theropod teeth.
Fig 1: First scheme with the variables obtained from theropod teeth.
Fig 2: Second scheme with the variables obtained from theropod teeth, shown on real teeth.
Fig 2: Second scheme with the variables obtained from theropod teeth, shown on real teeth.
Fig 3: Third scheme with the variables obtained from theropod teeth, shown on real teeth.
Fig 3: Third scheme with the variables obtained from theropod teeth, shown on real teeth.

All of the above schemes come from Hendrickx, Mateus, and Araújo (2015).

2 Reading the data

# Earlier input file and Epoch/genus lookup, kept for reference (disabled).
#data1 <- read_xlsx("Database_Hendrickx_2019_Dentes Poyos_Informacao idade.xlsx")
#dd<-data.table(Epoch=data1$Epoch,"Taxa (Genus)"=data1$`Taxa (Genus)`)
#dd<-dd[!duplicated(dd),]
data <- read_xlsx("Crown measurement dataset Kem Kem theropods.xlsx")

# Cells that hold nothing but a placeholder symbol are treated as missing
data[data == "?"] <- NA
data[data == "~"] <- NA
data[data == "/"] <- NA

# Strip annotation characters from every cell. gsub() returns character
# vectors, so all columns are character after this step; the measurement
# columns are converted to numeric further down.
# The literal "<" and ">" are matched through a character class because the
# escaped forms "\\<" and "\\>" are word-boundary anchors in R regular
# expressions and match no character at all.
data[] <- lapply(data, function(x) {
  x <- gsub("\\? ", "", x)   # "?" together with the space that follows it
  x <- gsub("absent", "0", x) # a feature recorded as "absent" counts as 0
  gsub("[?<>~:;]", "", x)    # remaining "?", "<", ">", "~", ":" and ";"
})

# LIF is sometimes recorded as a range; replace each range by a single value.
# NOTE(review): "10-13" is mapped to 12, whereas the other ranges are mapped
# to their midpoint (which would be 11.5 here) - confirm that 12 is intended.
lif_ranges <- c(
  "6-7" = "6.5",
  "5-6" = "5.5",
  "4-5" = "4.5",
  "3-4" = "3.5",
  "11 or 12" = "11.5",
  "10-13" = "12"
)
is_lif_range <- data$LIF %in% names(lif_ranges)
data$LIF[is_lif_range] <- unname(lif_ranges[data$LIF[is_lif_range]])

# Keep the column named CH...22 as CH and drop CH...60 and the derived ratio
data$CH <- data$CH...22
data <- data %>% select(-CH...60, -CH...22, -`(DDL/CH)*100`)

# Copy taken before the identification columns are dropped
data1 <- data

# Drop columns by position: columns 3 and 15 and everything from column 20
# onwards are kept
data <- data[, -c(1, 2, 5, 4, 6:14, 16:19)] # until 19
#clade: 4, taxa:2, teethtaxa: 3, cladetteth: 5, epoch:15
#data<-inner_join(dd,data)
#data<-data[!duplicated(data),]

# Presence/absence versions: any non-missing value other than 0 becomes 1,
# while 0 and NA are kept unchanged
data$TransvUndu <- ifelse(
  data$`Transv. Undu.` != 0 & !is.na(data$`Transv. Undu.`),
  1,
  data$`Transv. Undu.`
)

data$Interdentsulci <- ifelse(
  data$`Interdent. sulci` != 0 & !is.na(data$`Interdent. sulci`),
  1,
  data$`Interdent. sulci`
)

# The range "6-7" is replaced by 6.5
data$LAF <- ifelse(data$LAF == "6-7", 6.5, data$LAF)

# Keep only the first run of digits found in CTU
data$CTU1 <- sub(".*?(\\d+).*", "\\1", data$CTU)

data <- data %>% select(-CTU, -`Interdent. sulci`, -`Transv. Undu.`)

# Convert columns to numeric, then create log-transformed columns.
# Text that cannot be parsed becomes NA. log(0) is -Inf and the log of a
# negative value is NaN; note that -Inf is NOT counted as missing by is.na().
data <- data %>%
  mutate(across(3:ncol(data), as.numeric)) %>%
  mutate(across(3:ncol(data), log, .names = "Log_{.col}"))

data$TaxonToothtype <- as.factor(data$TaxonToothtype)
# One spelling of the epoch carries literal single quotes; merge it with the
# unquoted spelling before building the factor
data$Epoch <- ifelse(
  data$Epoch == "'Middle Cretaceous'",
  "Middle Cretaceous",
  data$Epoch
)
data$Epoch <- as.factor(data$Epoch)

# Columns to be checked
#columns_to_check <- c("MA", "MC", "MB", "DA", "DC", "DB", "MAVG", "DAVG", "DSDI")

# Replace values equal to 100 with 0 in the specified columns
#data[columns_to_check] <- lapply(data[columns_to_check], function(x) {
 # x[x == 100] <- 0
  #return(x)
#})

#data$Taxa<-as.factor(paste0(data$`Taxa (Genus)`,data$Maturity,sep=" "))
data <- data.frame(data)

3 Checking the data

3.1 Summary of the table

# Per-column overview of the prepared table: level counts for the factor
# columns, quartiles/mean and the number of NA's for the numeric columns
summary(data)
##                    TaxonToothtype               Epoch          CBL        
##  Saurornitholestes lateral:133    Late Cretaceous  :724   Min.   : 0.380  
##  Tyrannosaurus lateral    :114    Middle Cretaceous:238   1st Qu.: 4.282  
##  Acrocanthosaurus lateral : 48    Late Jurassic    :205   Median : 9.950  
##  Richardoestesia lateral  : 46    Early Cretaceous : 81   Mean   :13.916  
##  Majungasaurus  lateral   : 41    Late Triassic    : 57   3rd Qu.:19.782  
##  Pectinodon lateral       : 40    Middle Jurassic  : 55   Max.   :54.500  
##  (Other)                  :949    (Other)          : 11   NA's   :3       
##       CBW               AL              CBR              CHR       
##  Min.   : 0.540   Min.   :  0.55   Min.   :0.2500   Min.   :0.400  
##  1st Qu.: 2.300   1st Qu.: 12.40   1st Qu.:0.4598   1st Qu.:1.634  
##  Median : 6.100   Median : 29.82   Median :0.5420   Median :1.908  
##  Mean   : 9.074   Mean   : 37.19   Mean   :0.5920   Mean   :1.935  
##  3rd Qu.:13.430   3rd Qu.: 55.15   3rd Qu.:0.6895   3rd Qu.:2.191  
##  Max.   :48.600   Max.   :152.84   Max.   :2.1841   Max.   :4.222  
##  NA's   :69       NA's   :338      NA's   :81       NA's   :14     
##       MCL             MCW              MCR              MDE         
##  Min.   : 0.32   Min.   : 0.940   Min.   :0.3841   Min.   :-13.880  
##  1st Qu.: 6.69   1st Qu.: 4.480   1st Qu.:0.5000   1st Qu.:  0.000  
##  Median :12.13   Median : 7.390   Median :0.5745   Median :  0.000  
##  Mean   :13.02   Mean   : 8.274   Mean   :0.6087   Mean   :  5.636  
##  3rd Qu.:17.98   3rd Qu.:10.960   3rd Qu.:0.6818   3rd Qu.:  8.360  
##  Max.   :37.10   Max.   :30.200   Max.   :1.2792   Max.   : 58.400  
##  NA's   :876     NA's   :906      NA's   :907      NA's   :1024     
##       MSL              MEC              LAF               LIF         
##  Min.   :  1.42   Min.   :  0.00   Min.   : 0.0000   Min.   : 0.0000  
##  1st Qu.: 17.55   1st Qu.: 68.74   1st Qu.: 0.0000   1st Qu.: 0.0000  
##  Median : 28.02   Median :100.00   Median : 0.0000   Median : 0.0000  
##  Mean   : 32.77   Mean   : 83.93   Mean   : 0.3948   Mean   : 0.4748  
##  3rd Qu.: 44.01   3rd Qu.:100.00   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   :123.63   Max.   :113.69   Max.   :15.0000   Max.   :15.0000  
##  NA's   :1025     NA's   :1031     NA's   :739       NA's   :755      
##       DMT             DDT              DLAT            DLIT      
##  Min.   :0.100   Min.   : 0.100   Min.   :0.100   Min.   :0.100  
##  1st Qu.:1.400   1st Qu.: 1.250   1st Qu.:1.000   1st Qu.:1.075  
##  Median :2.200   Median : 3.000   Median :2.400   Median :2.200  
##  Mean   :2.888   Mean   : 3.152   Mean   :2.625   Mean   :2.432  
##  3rd Qu.:4.485   3rd Qu.: 4.255   3rd Qu.:4.000   3rd Qu.:3.345  
##  Max.   :8.500   Max.   :10.320   Max.   :8.140   Max.   :7.950  
##  NA's   :1324    NA's   :1324     NA's   :1322    NA's   :1324   
##        CA             CA2               MA              MC       
##  Min.   : 8.50   Min.   :-1.120   Min.   : 4.66   Min.   : 4.70  
##  1st Qu.:68.27   1st Qu.:-0.080   1st Qu.: 9.00   1st Qu.: 9.25  
##  Median :83.22   Median : 0.010   Median :11.25   Median :12.00  
##  Mean   :74.87   Mean   : 0.002   Mean   :13.76   Mean   :16.09  
##  3rd Qu.:86.28   3rd Qu.: 0.100   3rd Qu.:14.00   3rd Qu.:19.00  
##  Max.   :88.11   Max.   : 0.360   Max.   :60.00   Max.   :57.90  
##  NA's   :1004    NA's   :1026     NA's   :888     NA's   :590    
##        MB              DA              DC              DB            MAVG      
##  Min.   : 6.00   Min.   : 4.00   Min.   : 0.00   Min.   : 6.0   Min.   : 0.00  
##  1st Qu.:11.00   1st Qu.: 9.50   1st Qu.:10.24   1st Qu.:11.5   1st Qu.: 9.20  
##  Median :13.00   Median :12.00   Median :15.00   Median :14.8   Median :12.00  
##  Mean   :14.25   Mean   :13.89   Mean   :17.79   Mean   :16.5   Mean   :14.44  
##  3rd Qu.:16.00   3rd Qu.:15.00   3rd Qu.:21.06   3rd Qu.:18.5   3rd Qu.:16.08  
##  Max.   :45.00   Max.   :71.00   Max.   :70.00   Max.   :80.0   Max.   :55.00  
##  NA's   :1015    NA's   :790     NA's   :191     NA's   :845    NA's   :646    
##       DAVG           DAVG2              TDD              DSDI         
##  Min.   : 1.56   Min.   :-0.9200   Min.   :  0.20   Min.   :  0.6654  
##  1st Qu.: 7.65   1st Qu.:-0.0940   1st Qu.: 36.00   1st Qu.:  0.9329  
##  Median :11.50   Median : 0.0100   Median : 56.80   Median :  1.0000  
##  Mean   :13.69   Mean   : 0.8887   Mean   : 77.66   Mean   :  3.7840  
##  3rd Qu.:16.25   3rd Qu.: 0.1260   3rd Qu.:108.03   3rd Qu.:  1.1157  
##  Max.   :80.00   Max.   :35.0000   Max.   :368.62   Max.   :269.8500  
##  NA's   :284     NA's   :1056      NA's   :293      NA's   :737       
##       CMA              CAA              CDA              MDL        
##  Min.   : 16.11   Min.   : 3.151   Min.   : 17.79   Min.   :0.0864  
##  1st Qu.: 58.12   1st Qu.:23.483   1st Qu.: 80.08   1st Qu.:0.2632  
##  Median : 64.83   Median :27.066   Median : 86.92   Median :0.4167  
##  Mean   : 64.58   Mean   :27.332   Mean   : 88.09   Mean   :0.4079  
##  3rd Qu.: 71.52   3rd Qu.:31.122   3rd Qu.: 94.37   3rd Qu.:0.5405  
##  Max.   :148.96   Max.   :74.262   Max.   :160.74   Max.   :1.0638  
##  NA's   :443      NA's   :438      NA's   :439      NA's   :590     
##       DDL              ...61            CH            TransvUndu    
##  Min.   :0.07143   Min.   : NA    Min.   :  0.570   Min.   :0.0000  
##  1st Qu.:0.23739   1st Qu.: NA    1st Qu.:  7.707   1st Qu.:0.0000  
##  Median :0.33333   Median : NA    Median : 18.405   Median :1.0000  
##  Mean   :0.36248   Mean   :NaN    Mean   : 28.030   Mean   :0.5149  
##  3rd Qu.:0.48603   3rd Qu.: NA    3rd Qu.: 40.862   3rd Qu.:1.0000  
##  Max.   :1.11111   Max.   : NA    Max.   :145.550   Max.   :1.0000  
##  NA's   :191       NA's   :1371   NA's   :7         NA's   :969     
##  Interdentsulci        CTU1         Log_CBL           Log_CBW       
##  Min.   :0.0000   Min.   : 0.0   Min.   :-0.9676   Min.   :-0.6162  
##  1st Qu.:0.0000   1st Qu.: 0.0   1st Qu.: 1.4545   1st Qu.: 0.8329  
##  Median :0.0000   Median : 2.0   Median : 2.2976   Median : 1.8083  
##  Mean   :0.4187   Mean   : 1.5   Mean   : 2.1924   Mean   : 1.7243  
##  3rd Qu.:1.0000   3rd Qu.: 3.0   3rd Qu.: 2.9848   3rd Qu.: 2.5975  
##  Max.   :1.0000   Max.   :10.0   Max.   : 3.9982   Max.   : 3.8836  
##  NA's   :996      NA's   :969    NA's   :3         NA's   :69       
##      Log_AL           Log_CBR           Log_CHR           Log_MCL      
##  Min.   :-0.5978   Min.   :-1.3863   Min.   :-0.9163   Min.   :-1.139  
##  1st Qu.: 2.5177   1st Qu.:-0.7769   1st Qu.: 0.4912   1st Qu.: 1.901  
##  Median : 3.3952   Median :-0.6125   Median : 0.6461   Median : 2.496  
##  Mean   : 3.1984   Mean   :-0.5745   Mean   : 0.6339   Mean   : 2.336  
##  3rd Qu.: 4.0101   3rd Qu.:-0.3718   3rd Qu.: 0.7846   3rd Qu.: 2.889  
##  Max.   : 5.0294   Max.   : 0.7812   Max.   : 1.4404   Max.   : 3.614  
##  NA's   :338       NA's   :81        NA's   :14        NA's   :876     
##     Log_MCW           Log_MCR           Log_MDE         Log_MSL      
##  Min.   :-0.0619   Min.   :-0.9570   Min.   : -Inf   Min.   :0.3507  
##  1st Qu.: 1.4996   1st Qu.:-0.6931   1st Qu.: -Inf   1st Qu.:2.8650  
##  Median : 2.0001   Median :-0.5543   Median : -Inf   Median :3.3331  
##  Mean   : 1.9078   Mean   :-0.5228   Mean   : -Inf   Mean   :3.2569  
##  3rd Qu.: 2.3943   3rd Qu.:-0.3830   3rd Qu.:2.176   3rd Qu.:3.7843  
##  Max.   : 3.4078   Max.   : 0.2462   Max.   :4.067   Max.   :4.8173  
##  NA's   :906       NA's   :907       NA's   :1044    NA's   :1025    
##     Log_MEC         Log_LAF         Log_LIF         Log_DMT       
##  Min.   : -Inf   Min.   : -Inf   Min.   : -Inf   Min.   :-2.3026  
##  1st Qu.:4.230   1st Qu.: -Inf   1st Qu.: -Inf   1st Qu.: 0.3365  
##  Median :4.605   Median : -Inf   Median : -Inf   Median : 0.7885  
##  Mean   : -Inf   Mean   : -Inf   Mean   : -Inf   Mean   : 0.7166  
##  3rd Qu.:4.605   3rd Qu.: -Inf   3rd Qu.: -Inf   3rd Qu.: 1.5007  
##  Max.   :4.734   Max.   :2.708   Max.   :2.708   Max.   : 2.1401  
##  NA's   :1031    NA's   :739     NA's   :755     NA's   :1324     
##     Log_DDT           Log_DLAT          Log_DLIT           Log_CA     
##  Min.   :-2.3026   Min.   :-2.3026   Min.   :-2.3026   Min.   :2.140  
##  1st Qu.: 0.2223   1st Qu.: 0.0000   1st Qu.: 0.0721   1st Qu.:4.223  
##  Median : 1.0986   Median : 0.8755   Median : 0.7885   Median :4.421  
##  Mean   : 0.7985   Mean   : 0.5680   Mean   : 0.5162   Mean   :4.272  
##  3rd Qu.: 1.4478   3rd Qu.: 1.3863   3rd Qu.: 1.2074   3rd Qu.:4.457  
##  Max.   : 2.3341   Max.   : 2.0968   Max.   : 2.0732   Max.   :4.479  
##  NA's   :1324      NA's   :1322      NA's   :1324      NA's   :1004   
##     Log_CA2           Log_MA          Log_MC          Log_MB     
##  Min.   :  -Inf   Min.   :1.539   Min.   :1.548   Min.   :1.792  
##  1st Qu.:-3.219   1st Qu.:2.197   1st Qu.:2.225   1st Qu.:2.398  
##  Median :-2.408   Median :2.420   Median :2.485   Median :2.565  
##  Mean   :  -Inf   Mean   :2.498   Mean   :2.629   Mean   :2.608  
##  3rd Qu.:-1.897   3rd Qu.:2.639   3rd Qu.:2.944   3rd Qu.:2.773  
##  Max.   :-1.022   Max.   :4.094   Max.   :4.059   Max.   :3.807  
##  NA's   :1182     NA's   :888     NA's   :590     NA's   :1015   
##      Log_DA          Log_DC          Log_DB         Log_MAVG    
##  Min.   :1.386   Min.   : -Inf   Min.   :1.792   Min.   : -Inf  
##  1st Qu.:2.251   1st Qu.:2.326   1st Qu.:2.442   1st Qu.:2.219  
##  Median :2.485   Median :2.708   Median :2.695   Median :2.485  
##  Mean   :2.527   Mean   : -Inf   Mean   :2.721   Mean   : -Inf  
##  3rd Qu.:2.708   3rd Qu.:3.047   3rd Qu.:2.918   3rd Qu.:2.778  
##  Max.   :4.263   Max.   :4.248   Max.   :4.382   Max.   :4.007  
##  NA's   :790     NA's   :191     NA's   :845     NA's   :646    
##     Log_DAVG        Log_DAVG2         Log_TDD          Log_DSDI      
##  Min.   :0.4447   Min.   :  -Inf   Min.   :-1.609   Min.   :-0.4074  
##  1st Qu.:2.0347   1st Qu.:-2.996   1st Qu.: 3.584   1st Qu.:-0.0694  
##  Median :2.4423   Median :-2.278   Median : 4.040   Median : 0.0000  
##  Mean   :2.3747   Mean   :  -Inf   Mean   : 4.039   Mean   : 0.1028  
##  3rd Qu.:2.7879   3rd Qu.:-1.561   3rd Qu.: 4.682   3rd Qu.: 0.1095  
##  Max.   :4.3820   Max.   : 3.555   Max.   : 5.910   Max.   : 5.5979  
##  NA's   :284      NA's   :1195     NA's   :293      NA's   :737      
##     Log_CMA         Log_CAA         Log_CDA         Log_MDL       
##  Min.   :2.779   Min.   :1.148   Min.   :2.879   Min.   :-2.4493  
##  1st Qu.:4.062   1st Qu.:3.156   1st Qu.:4.383   1st Qu.:-1.3350  
##  Median :4.172   Median :3.298   Median :4.465   Median :-0.8755  
##  Mean   :4.149   Mean   :3.275   Mean   :4.467   Mean   :-1.0193  
##  3rd Qu.:4.270   3rd Qu.:3.438   3rd Qu.:4.547   3rd Qu.:-0.6152  
##  Max.   :5.004   Max.   :4.308   Max.   :5.080   Max.   : 0.0619  
##  NA's   :443     NA's   :438     NA's   :439     NA's   :590      
##     Log_DDL          Log_...61        Log_CH        Log_TransvUndu
##  Min.   :-2.6391   Min.   : NA    Min.   :-0.5621   Min.   :-Inf  
##  1st Qu.:-1.4380   1st Qu.: NA    1st Qu.: 2.0422   1st Qu.:-Inf  
##  Median :-1.0986   Median : NA    Median : 2.9126   Median :   0  
##  Mean   :-1.1308   Mean   :NaN    Mean   : 2.8310   Mean   :-Inf  
##  3rd Qu.:-0.7215   3rd Qu.: NA    3rd Qu.: 3.7102   3rd Qu.:   0  
##  Max.   : 0.1054   Max.   : NA    Max.   : 4.9805   Max.   :   0  
##  NA's   :191       NA's   :1371   NA's   :7         NA's   :969   
##  Log_Interdentsulci    Log_CTU1     
##  Min.   :-Inf       Min.   :  -Inf  
##  1st Qu.:-Inf       1st Qu.:  -Inf  
##  Median :-Inf       Median :0.6931  
##  Mean   :-Inf       Mean   :  -Inf  
##  3rd Qu.:   0       3rd Qu.:1.0986  
##  Max.   :   0       Max.   :2.3026  
##  NA's   :996        NA's   :969

3.2 Check first rows

# Show the first rows of the prepared table
head(data)

3.3 Tables

3.3.1 Genus / Taxa Table

# Number of teeth per TaxonToothtype, sorted from the most to the least
# frequent, with a running rank in ID
taxa <- table(data$TaxonToothtype)
data_taxa <- data.frame(taxa)
data_taxa <- data_taxa[order(data_taxa$Freq, decreasing = TRUE), ]
# seq_len() also gives a valid (empty) index when the table has no rows
data_taxa$ID <- seq_len(nrow(data_taxa))
data_taxa

4 Data Processing

4.1 Removing columns that have too many missing values

# Count the number of missing values in each column
missing_counts <- colSums(is.na(data))

# Keep the columns with at most 15% missing values; drop = FALSE keeps the
# result a data frame even if only one column passes the threshold
data1_cleaned <- data[, missing_counts <= nrow(data) * 0.15, drop = FALSE]

4.2 Removing rows with NA values

# Keep only the complete rows: any row with at least one NA in the remaining
# columns is dropped. The result is then printed.
data1_cleaned <- na.omit(data1_cleaned)
data1_cleaned

4.3 Removing the taxa that have too few observations

# Threshold on the group size: half the number of columns left after the two
# identifier columns. A taxon is kept only if it has MORE rows than this.
lennn <- (ncol(data1_cleaned) - 2) / 2

# Rows per TaxonToothtype, most frequent first, with a running rank in ID
taxa1 <- table(data1_cleaned$TaxonToothtype)
data1_cleanedd <- data.frame(taxa1)
data1_cleanedd <- data1_cleanedd[order(data1_cleanedd$Freq, decreasing = TRUE), ]
# seq_len() also gives a valid (empty) index when the table has no rows
data1_cleanedd$ID <- seq_len(nrow(data1_cleanedd))
data1_cleanedd$TaxonToothtype <- data1_cleanedd$Var1

# Taxa that exceed the threshold
data1_cleaned1 <- data1_cleanedd[data1_cleanedd$Freq > lennn, ]

# Keep only the rows that belong to those taxa
data1_cleaned <- data1_cleaned[
  data1_cleaned$TaxonToothtype %in% unique(data1_cleaned1$TaxonToothtype),
]

summary(data1_cleaned)
##                    TaxonToothtype               Epoch          CBL        
##  Saurornitholestes lateral:133    Late Cretaceous  :596   Min.   : 1.400  
##  Tyrannosaurus lateral    :106    Late Jurassic    :135   1st Qu.: 4.775  
##  Richardoestesia lateral  : 45    Middle Cretaceous:118   Median :10.910  
##  Acrocanthosaurus lateral : 42    Early Cretaceous : 40   Mean   :15.165  
##  Majungasaurus  lateral   : 40    Late Triassic    : 25   3rd Qu.:22.465  
##  Pectinodon lateral       : 40    Middle Jurassic  : 25   Max.   :54.500  
##  (Other)                  :533    (Other)          :  0                   
##       CBW              CBR              CHR              DC       
##  Min.   : 0.600   Min.   :0.2500   Min.   :0.400   Min.   : 4.50  
##  1st Qu.: 2.200   1st Qu.:0.4480   1st Qu.:1.634   1st Qu.:10.00  
##  Median : 5.700   Median :0.5200   Median :1.905   Median :15.00  
##  Mean   : 9.409   Mean   :0.5676   Mean   :1.913   Mean   :17.36  
##  3rd Qu.:14.480   3rd Qu.:0.6516   3rd Qu.:2.179   3rd Qu.:20.20  
##  Max.   :48.600   Max.   :2.1840   Max.   :3.509   Max.   :60.00  
##                                                                   
##       DDL                CH            Log_CBL          Log_CBW       
##  Min.   :0.08333   Min.   :  2.20   Min.   :0.3365   Min.   :-0.5108  
##  1st Qu.:0.24752   1st Qu.:  8.50   1st Qu.:1.5634   1st Qu.: 0.7885  
##  Median :0.33333   Median : 19.10   Median :2.3897   Median : 1.7405  
##  Mean   :0.36771   Mean   : 30.48   Mean   :2.3158   Mean   : 1.6997  
##  3rd Qu.:0.50000   3rd Qu.: 45.33   3rd Qu.:3.1120   3rd Qu.: 2.6728  
##  Max.   :1.11111   Max.   :145.55   Max.   :3.9982   Max.   : 3.8836  
##                                                                       
##     Log_CBR           Log_CHR            Log_DC         Log_DDL       
##  Min.   :-1.3863   Min.   :-0.9163   Min.   :1.504   Min.   :-2.4849  
##  1st Qu.:-0.8029   1st Qu.: 0.4912   1st Qu.:2.303   1st Qu.:-1.3962  
##  Median :-0.6539   Median : 0.6444   Median :2.708   Median :-1.0986  
##  Mean   :-0.6149   Mean   : 0.6260   Mean   :2.723   Mean   :-1.1136  
##  3rd Qu.:-0.4283   3rd Qu.: 0.7787   3rd Qu.:3.006   3rd Qu.:-0.6931  
##  Max.   : 0.7812   Max.   : 1.2554   Max.   :4.094   Max.   : 0.1054  
##                                                                       
##      Log_CH      
##  Min.   :0.7885  
##  1st Qu.:2.1401  
##  Median :2.9497  
##  Mean   :2.9494  
##  3rd Qu.:3.8140  
##  Max.   :4.9805  
## 

4.4 Splitting the Log variables and the original

# Split the cleaned table into a log-scale table and a raw-scale table.

# Positions of the log-transformed columns. The pattern is anchored to the
# "Log_" prefix so that a raw column whose name merely contains "Log" is not
# mistaken for a log-transformed one.
log_cols <- grep("^Log_", names(data1_cleaned))

# Columns 1 and 2 plus the log-transformed columns
selected_cols <- c(1, 2, log_cols)

# Subset the data frame (drop = FALSE keeps a data frame in every case)
data_log <- data1_cleaned[, selected_cols, drop = FALSE]

data_log
# Replace spaces by underscores in all column names except the first
names(data_log)[-1] <- gsub(" ", "_", names(data_log)[-1])


# Every column that is not log-transformed
cols_to_keep <- setdiff(seq_len(ncol(data1_cleaned)), log_cols)

# Ensure the first column is included
cols_to_keep <- union(1, cols_to_keep)

# Subset the data frame
data_original <- data1_cleaned[, cols_to_keep, drop = FALSE]

data_original

5 Visualization

5.1 Original data

# Pairwise correlations between all columns of the raw-scale table except the
# first two
correlation_matrix <- cor(data_original[, -(1:2)])

# Draw the lower triangle as a blue-white-red colour map, with the
# coefficients printed in black and the variables ordered by hierarchical
# clustering
corrplot(
  correlation_matrix,
  method = "color",
  type = "lower",
  order = "hclust",
  diag = FALSE,
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45,
  col = colorRampPalette(c("blue", "white", "red"))(200)
)

5.2 Log data

# Pairwise correlations between all columns of the log-scale table except the
# first two
correlation_matrix <- cor(data_log[, -(1:2)])

# Draw the lower triangle as a blue-white-red colour map, with the
# coefficients printed in black and the variables ordered by hierarchical
# clustering
corrplot(
  correlation_matrix,
  method = "color",
  type = "lower",
  order = "hclust",
  diag = FALSE,
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45,
  col = colorRampPalette(c("blue", "white", "red"))(200)
)

# List the columns of the raw-scale table
colnames(data_original)
## [1] "TaxonToothtype" "Epoch"          "CBL"            "CBW"           
## [5] "CBR"            "CHR"            "DC"             "DDL"           
## [9] "CH"
# Keep only the rows whose TaxonToothtype group has at least `min_n` rows.
# The result is returned ungrouped.
keep_frequent_taxa <- function(df, min_n = 10) {
  df %>%
    group_by(TaxonToothtype) %>%
    filter(n() >= min_n) %>%
    ungroup()
}

# Filter out taxa with less than 10 observations, in both tables
data_log <- keep_frequent_taxa(data_log)
data_original <- keep_frequent_taxa(data_original)

# Get the count of each TaxonToothtype value in the filtered log-scale table
category_counts <- table(data_log$TaxonToothtype)

# Names of the values that still have at least one observation
unique_values <- names(category_counts[category_counts > 0])

# Print the result
#print(unique_values)

6 Merging data clade with tooth to add more classes

# Rows whose TaxonToothtype is listed in unique_values keep their label; all
# other rows are relabelled with the value of `Taxa (Genus)`
data1$TaxonToothtype <- ifelse(
  data1$TaxonToothtype %in% unique_values,
  data1$TaxonToothtype,
  data1$`Taxa (Genus)`
)

# Drop columns by position: columns 3 and 15 and everything from column 20
# onwards are kept
data1 <- data1[, -c(1, 2, 4:14, 16:19)] # until 19
#clade: 4, taxa:2, teethtaxa: 3, cladetteth: 5, epoch:15
#data<-inner_join(dd,data)
#data<-data[!duplicated(data),]

# Derived columns:
#  - TransvUndu / Interdentsulci: any non-missing value other than 0 becomes 1,
#    while 0 and NA are kept unchanged
#  - LAF: the range "6-7" is replaced by 6.5
#  - CTU1: only the first run of digits found in CTU is kept
# The source columns of the derived variables are removed afterwards.
data1 <- data1 %>%
  mutate(
    TransvUndu = ifelse(
      !is.na(`Transv. Undu.`) & `Transv. Undu.` != 0,
      1,
      `Transv. Undu.`
    ),
    Interdentsulci = ifelse(
      !is.na(`Interdent. sulci`) & `Interdent. sulci` != 0,
      1,
      `Interdent. sulci`
    ),
    LAF = ifelse(LAF == "6-7", 6.5, LAF),
    CTU1 = sub(".*?(\\d+).*", "\\1", CTU)
  ) %>%
  select(-CTU, -`Interdent. sulci`, -`Transv. Undu.`)

# Convert columns to numeric, then create log-transformed columns.
# This is a separate statement on purpose: ncol(data1) must be evaluated on
# the table as it stands after the step above.
data1 <- data1 %>%
  mutate(across(3:ncol(data1), as.numeric)) %>%
  mutate(across(3:ncol(data1), log, .names = "Log_{.col}"))
## Warning: There were 17 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(3:ncol(data1), as.numeric)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 16 remaining warnings.
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `across(3:ncol(data1), log, .names = "Log_{.col}")`.
## Caused by warning:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
data1$TaxonToothtype <- as.factor(data1$TaxonToothtype)
# One spelling of the epoch carries literal single quotes; merge it with the
# unquoted spelling before building the factor
data1$Epoch <- ifelse(
  data1$Epoch == "'Middle Cretaceous'",
  "Middle Cretaceous",
  data1$Epoch
)
data1$Epoch <- as.factor(data1$Epoch)


#data$Taxa<-as.factor(paste0(data$`Taxa (Genus)`,data$Maturity,sep=" "))
data1 <- data.frame(data1)



# Count the number of missing values in each column
missing_counts <- colSums(is.na(data1))

# Keep the columns with at most 15% missing values; drop = FALSE keeps the
# result a data frame even if only one column passes the threshold
data1_cleaned <- data1[, missing_counts <= nrow(data1) * 0.15, drop = FALSE]

# Remove rows with any NA values
data1_cleaned <- na.omit(data1_cleaned)
data1_cleaned

# Threshold on the group size: half the number of columns left after the two
# identifier columns. A taxon is kept only if it has MORE rows than this.
lennn <- (ncol(data1_cleaned) - 2) / 2

# Rows per TaxonToothtype, most frequent first, with a running rank in ID
taxa1 <- table(data1_cleaned$TaxonToothtype)
data1_cleanedd <- data.frame(taxa1)
data1_cleanedd <- data1_cleanedd[order(data1_cleanedd$Freq, decreasing = TRUE), ]
# seq_len() also gives a valid (empty) index when the table has no rows
data1_cleanedd$ID <- seq_len(nrow(data1_cleanedd))
data1_cleanedd$TaxonToothtype <- data1_cleanedd$Var1

# Taxa that exceed the threshold
data1_cleaned1 <- data1_cleanedd[data1_cleanedd$Freq > lennn, ]

# Keep only the rows that belong to those taxa
data1_cleaned <- data1_cleaned[
  data1_cleaned$TaxonToothtype %in% unique(data1_cleaned1$TaxonToothtype),
]

summary(data1_cleaned)
##                    TaxonToothtype               Epoch          CBL       
##  Saurornitholestes lateral:133    Late Cretaceous  :601   Min.   : 1.40  
##  Tyrannosaurus lateral    :106    Middle Cretaceous:149   1st Qu.: 4.89  
##  Richardoestesia lateral  : 45    Late Jurassic    :137   Median :11.09  
##  Acrocanthosaurus lateral : 42    Early Cretaceous : 40   Mean   :15.11  
##  Majungasaurus  lateral   : 40    Late Triassic    : 29   3rd Qu.:21.99  
##  Pectinodon lateral       : 40    Middle Jurassic  : 28   Max.   :54.50  
##  (Other)                  :578    (Other)          :  0                  
##       CBW              CBR              CHR              DC       
##  Min.   : 0.600   Min.   :0.2500   Min.   :0.400   Min.   : 4.50  
##  1st Qu.: 2.220   1st Qu.:0.4490   1st Qu.:1.646   1st Qu.:10.00  
##  Median : 5.940   Median :0.5233   Median :1.916   Median :15.00  
##  Mean   : 9.342   Mean   :0.5698   Mean   :1.921   Mean   :17.25  
##  3rd Qu.:14.287   3rd Qu.:0.6571   3rd Qu.:2.184   3rd Qu.:20.00  
##  Max.   :48.600   Max.   :2.1840   Max.   :3.509   Max.   :60.00  
##                                                                   
##       DDL                CH             Log_CBL          Log_CBW       
##  Min.   :0.08333   Min.   :  2.200   Min.   :0.3365   Min.   :-0.5108  
##  1st Qu.:0.25000   1st Qu.:  8.678   1st Qu.:1.5872   1st Qu.: 0.7975  
##  Median :0.33333   Median : 19.845   Median :2.4056   Median : 1.7817  
##  Mean   :0.36781   Mean   : 30.396   Mean   :2.3223   Mean   : 1.7102  
##  3rd Qu.:0.50000   3rd Qu.: 44.237   3rd Qu.:3.0907   3rd Qu.: 2.6594  
##  Max.   :1.11111   Max.   :145.550   Max.   :3.9982   Max.   : 3.8836  
##                                                                        
##     Log_CBR           Log_CHR            Log_DC         Log_DDL       
##  Min.   :-1.3863   Min.   :-0.9163   Min.   :1.504   Min.   :-2.4849  
##  1st Qu.:-0.8008   1st Qu.: 0.4981   1st Qu.:2.303   1st Qu.:-1.3863  
##  Median :-0.6477   Median : 0.6502   Median :2.708   Median :-1.0986  
##  Mean   :-0.6109   Mean   : 0.6308   Mean   :2.720   Mean   :-1.1101  
##  3rd Qu.:-0.4199   3rd Qu.: 0.7813   3rd Qu.:2.996   3rd Qu.:-0.6931  
##  Max.   : 0.7812   Max.   : 1.2554   Max.   :4.094   Max.   : 0.1054  
##                                                                       
##      Log_CH      
##  Min.   :0.7885  
##  1st Qu.:2.1607  
##  Median :2.9880  
##  Mean   :2.9603  
##  3rd Qu.:3.7896  
##  Max.   :4.9805  
## 
# Split the cleaned table into a log-scale table and a raw-scale table.

# Positions of the log-transformed columns. The pattern is anchored to the
# "Log_" prefix so that a raw column whose name merely contains "Log" is not
# mistaken for a log-transformed one.
log_cols <- grep("^Log_", names(data1_cleaned))

# Columns 1 and 2 plus the log-transformed columns
selected_cols <- c(1, 2, log_cols)

# Subset the data frame (drop = FALSE keeps a data frame in every case)
data_log <- data1_cleaned[, selected_cols, drop = FALSE]

data_log
# Replace spaces by underscores in all column names except the first
names(data_log)[-1] <- gsub(" ", "_", names(data_log)[-1])


# Every column that is not log-transformed
cols_to_keep <- setdiff(seq_len(ncol(data1_cleaned)), log_cols)

# Ensure the first column is included
cols_to_keep <- union(1, cols_to_keep)

# Subset the data frame
data_original <- data1_cleaned[, cols_to_keep, drop = FALSE]

data_original
colnames(data_original)
## [1] "TaxonToothtype" "Epoch"          "CBL"            "CBW"           
## [5] "CBR"            "CHR"            "DC"             "DDL"           
## [9] "CH"
# Keep only the rows whose TaxonToothtype group has at least `min_n` rows.
# The result is returned ungrouped.
keep_frequent_taxa <- function(df, min_n = 10) {
  df %>%
    group_by(TaxonToothtype) %>%
    filter(n() >= min_n) %>%
    ungroup()
}

# Filter out taxa with less than 10 observations, in both tables
data_log <- keep_frequent_taxa(data_log)
data_original <- keep_frequent_taxa(data_original)

# Get the count of each TaxonToothtype value in the filtered log-scale table
category_counts <- table(data_log$TaxonToothtype)

# Names of the values that still have at least one observation
unique_values1 <- names(category_counts[category_counts > 0])

7 Saving the clean dataset

# Write the log-scale and the raw-scale tables to CSV files (paths are
# relative to the working directory), without a row-name column
write.csv(
  x = data_log,
  file = "teeth_data_log_taxa_epoch1.csv",
  row.names = FALSE
)
write.csv(
  x = data_original,
  file = "teeth_data_taxa_epoch1.csv",
  row.names = FALSE
)

References

Hendrickx, Christophe, Octávio Mateus, and Ricardo Araújo. 2015. “A Proposed Terminology of Theropod Teeth (Dinosauria, Saurischia).” Journal of Vertebrate Paleontology 35 (5): e982797. https://doi.org/10.1080/02724634.2015.982797.